/* Copyright (c) 2003 The Nutch Organization. All rights reserved. */ /* Use subject to the conditions in http://www.nutch.org/LICENSE.txt. */ package net.nutch.db; import java.io.*; import java.net.*; import java.net.MalformedURLException; import net.nutch.io.*; import net.nutch.util.*; import net.nutch.net.UrlNormalizer; /********************************************* * This is the field in the Link Database. * Each row is a Link: * type name description * --------------------------------------------------------------- * byte VERSION - A byte indicating the version of this entry. * 128bit FROM_ID - The MD5 hash of the source of the link. * 64bit DOMAIN_ID - The 8-byte MD5Hash of the source's domain. * string TO_URL - The URL destination of the link. * string ANCHOR - The anchor text of the link. * boolean TARGET_HAS_OUTLINK - Whether the target of the link has outlinks. * * @author Mike Cafarella *************************************************/ public class Link implements WritableComparable { public static final int MAX_ANCHOR_LENGTH = NutchConf.getInt("db.max.anchor.length", 100); private final static byte VERSION_1 = 1; private final static byte VERSION_2 = 2; private final static byte CUR_VERSION = 5; private MD5Hash fromID; private UTF8 url; private long domainID; private UTF8 anchor; private boolean targetHasOutlink; /** * Create the Link with no data */ public Link() { this.fromID = new MD5Hash(); this.url = new UTF8(); this.domainID = 0; this.anchor = new UTF8(); this.targetHasOutlink = false; } /** * Create the record */ public Link(MD5Hash fromID, long domainID, String urlString, String anchorText) throws MalformedURLException { this.fromID = fromID; this.url = new UTF8(UrlNormalizer.normalize(urlString)); this.domainID = domainID; // truncate long anchors if (anchorText.length() > MAX_ANCHOR_LENGTH) anchorText = anchorText.substring(0, MAX_ANCHOR_LENGTH); this.anchor = new UTF8(anchorText); this.targetHasOutlink = false; } /** * Read in fields from a bytestream */ public void readFields(DataInput in) throws IOException { byte version = in.readByte(); if (version > CUR_VERSION) throw new VersionMismatchException(CUR_VERSION, version); if (fromID == null) fromID = new MD5Hash(); fromID.readFields(in); if (url == null) url = new UTF8(); url.readFields(in); // 'domainID' was addded in Version 4 domainID = (version > 4) ? in.readLong() : 0; if (anchor == null) anchor = new UTF8(); anchor.readFields(in); // 'targetHasOutlink' added in Version 3. targetHasOutlink = (version > 3) ? in.readBoolean() : false; } /** */ public void set(Link that) { this.fromID.set(that.fromID); this.url.set(that.url); this.domainID = that.getDomainID(); this.anchor.set(that.anchor); this.targetHasOutlink = that.targetHasOutlink; } /** * Write bytes out to stream */ public void write(DataOutput out) throws IOException { out.write(CUR_VERSION); fromID.write(out); url.write(out); out.writeLong(domainID); anchor.write(out); out.writeBoolean(targetHasOutlink); } public static Link read(DataInput in) throws IOException { Link lr = new Link(); lr.readFields(in); return lr; } // // Accessors // public MD5Hash getFromID() { return fromID; } public UTF8 getURL() { return url; } public long getDomainID() { return domainID; } public UTF8 getAnchorText() { return anchor; } public boolean targetHasOutlink() { return targetHasOutlink; } public void setTargetHasOutlink(boolean targetHasOutlink) { this.targetHasOutlink = targetHasOutlink; } /** * Print out the record */ public String toString() { StringBuffer buf = new StringBuffer(); buf.append("Version: " + CUR_VERSION + "\n"); buf.append("ID: " + getFromID() + "\n"); buf.append("DomainID: " + getDomainID() + "\n"); buf.append("URL: " + getURL() + "\n"); buf.append("AnchorText: " + getAnchorText() + "\n"); buf.append("targetHasOutlink: " + targetHasOutlink() + "\n"); return buf.toString(); } /** * Get a tab-delimited version of the text data. */ public String toTabbedString() { StringBuffer buf = new StringBuffer(); buf.append("" + CUR_VERSION); buf.append("\t"); buf.append(getFromID().toString()); buf.append("\t"); buf.append(getDomainID()); buf.append("\t"); buf.append(getURL()); buf.append("\t"); buf.append(getAnchorText()); buf.append("\t"); buf.append(targetHasOutlink()); buf.append("\t"); return buf.toString(); } /** */ public int compareTo(Object o) { return urlCompare(o); } /** * Compare URLs, then compare MD5s. */ public int urlCompare(Object o) { int urlResult = this.url.compareTo(((Link) o).url); if (urlResult != 0) { return urlResult; } return this.fromID.compareTo(((Link) o).fromID); } /** * Compare MD5s, then compare URLs. */ public int md5Compare(Object o) { int md5Result = this.fromID.compareTo(((Link) o).fromID); if (md5Result != 0) { return md5Result; } return this.url.compareTo(((Link) o).url); } /** * URLComparator uses the standard method where, uh, * the URL comes first. */ public static class UrlComparator extends WritableComparator { public UrlComparator() { super(Link.class); } public int compare(WritableComparable a, WritableComparable b) { return ((Link) a).urlCompare(b); } /** Optimized comparator. */ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { int md5Start1 = s1 + 1; // skip version int md5Start2 = s2 + 1; int urlLenStart1 = md5Start1 + MD5Hash.MD5_LEN; int urlLenStart2 = md5Start2 + MD5Hash.MD5_LEN; int urlLen1 = readUnsignedShort(b1, urlLenStart1); int urlLen2 = readUnsignedShort(b2, urlLenStart2); int urlStart1 = urlLenStart1+2; int urlStart2 = urlLenStart2+2; // compare urls int c = compareBytes(b1, urlStart1, urlLen1, b2, urlStart2, urlLen2); if (c != 0) return c; // compare md5s return compareBytes(b1, md5Start1, MD5Hash.MD5_LEN, b2, md5Start2, MD5Hash.MD5_LEN); } } /** * MD5Comparator is the opposite. */ public static class MD5Comparator extends WritableComparator { public MD5Comparator() { super(Link.class); } public int compare(WritableComparable a, WritableComparable b) { return ((Link) a).md5Compare(b); } /** Optimized comparator. */ public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) { // compare md5s int md5Start1 = s1 + 1; // skip version int md5Start2 = s2 + 1; int c = compareBytes(b1, md5Start1, MD5Hash.MD5_LEN, b2, md5Start2, MD5Hash.MD5_LEN); if (c != 0) return c; // compare urls int urlLenStart1 = md5Start1 + MD5Hash.MD5_LEN; int urlLenStart2 = md5Start2 + MD5Hash.MD5_LEN; int urlLen1 = readUnsignedShort(b1, urlLenStart1); int urlLen2 = readUnsignedShort(b2, urlLenStart2); int urlStart1 = urlLenStart1+2; int urlStart2 = urlLenStart2+2; return compareBytes(b1, urlStart1, urlLen1, b2, urlStart2, urlLen2); } } }